# One-time setup: install webshot and PhantomJS only when they are missing,
# so that re-running the script does not reinstall them every time.
if (!requireNamespace("webshot", quietly = TRUE)) {
  install.packages("webshot")
}
if (!webshot::is_phantomjs_installed()) {
  webshot::install_phantomjs()
}

# Load every package used by the rest of the script.
pacman::p_load(
  dplyr, ggplot2, tm, SnowballC, wordcloud2, RColorBrewer,
  plotly, stringr, d3heatmap, htmlwidgets, readr, maps, Matrix
)

# Load the saved workspace (expected to provide the data frame `X`) and
# print a per-column summary of it.
load("data/X.rdata")
summary(X)
#> url sub date
#> Length:10760 Business & Finance :2249 Min. :2010-02-17
#> Class :character R&D :1857 1st Qu.:2013-06-03
#> Mode :character Grid Connection :1319 Median :2015-03-23
#> Authorities :1131 Mean :2015-03-05
#> Technology :1077 3rd Qu.:2017-03-02
#> Operations & Maintenance: 947 Max. :2019-04-12
#> (Other) :2180
#> title abstract author tags
#> Length:10760 Length:10760 Length:10760 Length:10760
#> Class :character Class :character Class :character Class :character
#> Mode :character Mode :character Mode :character Mode :character
#> text rov
#> Length:10760 Min. : 0.000
#> Class :character 1st Qu.: 0.000
#> Mode :character Median : 0.000
#> Mean : 0.033
#> 3rd Qu.: 0.000
#> Max. :13.000
# Histogram of the number of articles per calendar year.
par(cex = 0.8, mar = c(6, 4, 4, 2))
hist(
  X$date, "year",
  freq = TRUE, main = "No. Articles per Year", las = 2, xlab = ""
)

# Horizontal bar chart of the number of articles per subject, sorted by count.
# The left margin is widened to 12 lines for the subject labels.
par(cex = 0.8, mar = c(4, 12, 4, 2))
table(X$sub) %>%
  sort() %>%
  barplot(
    las = 2, horiz = TRUE,
    main = "No. Articles per Subject", xlab = "freq"
  )

# Yearly article counts per subject, drawn as proportionally filled bars
# (position = "fill") and made interactive with plotly.
p <- X %>%
  mutate(year = as.integer(format(date, "%Y"))) %>%
  group_by(year, sub) %>%
  count() %>%
  ggplot(aes(x = year, y = n, fill = sub)) +
  geom_bar(stat = "identity", position = "fill") +
  scale_x_continuous(breaks = 2009:2019)
ggplotly(p)

# Stop words removed before building word clouds: the English list from tm
# plus extra words specified for this corpus.
stops <- c(
  stopwords("en"), "offshore", "wind", "energy", "will",
  "said", "also", "can"
)
# Build a word cloud for the articles of one subject, save a screenshot of it,
# and return the underlying word-frequency table.
#
# Arguments:
#   subject   value of `X$sub` selecting the articles to use.
#   output    file path passed to webshot::webshot() for the screenshot.
#   ROV       if TRUE, keep only articles whose text matches "ROV|ROUV".
#   min.freq  only words with a frequency strictly above this value are drawn.
#   xstop     extra stop words, removed in addition to the global `stops`.
#   ...       forwarded to wordcloud2().
#
# Returns:
#   A data frame with columns `word` and `freq`, sorted by decreasing
#   frequency. It contains all words, not only the ones that were drawn.
#
# Side effects:
#   Writes the widget to "temp.html" in the working directory and writes the
#   screenshot to `output`. Reads the globals `X` and `stops`.
WC <- function(subject, output, ROV = FALSE, min.freq = 25, xstop = c(), ...) {
  X1 <- X %>% filter(sub == subject)
  if (ROV) {
    X1 <- subset(X1, str_count(text, "ROV|ROUV") > 0)
  }

  # Characters that cannot be converted to ASCII are dropped (sub = "").
  txt <- iconv(X1$text, "latin1", "ASCII", sub = "")

  # Text cleaning: lower-case, then strip numbers, stop words, punctuation
  # and redundant whitespace, in that order.
  docs <- Corpus(VectorSource(txt))
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removeWords, c(stops, xstop))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)

  # Total frequency of every term across the selected articles.
  dtm <- TermDocumentMatrix(docs)
  m <- as.matrix(dtm)
  v <- sort(rowSums(m), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # Render the cloud to HTML first, then screenshot the page.
  hw <- wordcloud2(subset(d, freq > min.freq), ...)
  saveWidget(hw, "temp.html", selfcontained = FALSE)
  webshot::webshot("temp.html", output, vwidth = 800, vheight = 600, delay = 20)
  d
}
# NOTE(review): the stray text "7.png" was fused to the closing brace above in
# the original; presumably an image shown by the notebook. Confirm.
# 9.png
# Count country mentions per article and aggregate them by subject.

# One alternation pattern built from every name in `iso3166$ISOname`.
# NOTE(review): the names are used as raw regex fragments with no escaping or
# word boundaries, so a name can also match inside a longer word. Confirm that
# this is acceptable.
pat <- paste(iso3166$ISOname, collapse = "|")

# NOTE(review): stringr::regex() documents this option as `ignore_case`. The
# `ignore.case` spelling is kept as-is so the results do not change. If it is
# ever corrected, normalise the case of the matches before table() and match()
# below, because both are case-sensitive.
CX <- str_extract_all(X$text, regex(pat, ignore.case = TRUE))

# Names matched more than 9 times over all articles, most frequent first.
N9 <- unlist(CX) %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  {.[. > 9]} %>%
  names()

# For each article, the position in N9 of every match (NA when not in N9).
C9 <- lapply(CX, match, N9)

# Long table of (article index, country index) pairs, one row per mention.
df <- data.frame(
  i = rep(seq_along(C9), lengths(C9)),
  j = unlist(C9, use.names = FALSE)
)
df <- df[complete.cases(df), ]
df$x <- 1

# Order the subject levels by decreasing article count.
z <- table(X$sub) %>%
  sort(decreasing = TRUE) %>%
  names()
X$sub <- factor(X$sub, levels = z)
X$year <- format(X$date, "%Y")

# Article-by-country matrix. `dims` is given explicitly so that trailing
# articles without any match still get a row and the dimnames always fit.
mx <- sparseMatrix(
  i = df$i, j = df$j, x = df$x,
  dims = c(nrow(X), length(N9)),
  dimnames = list(seq_len(nrow(X)), N9)
) %>%
  as.data.frame.matrix()

# Country-by-subject totals. This is computed once, after `X$sub` has been
# re-levelled, so the columns follow the new subject order.
A <- vapply(split(mx, X$sub), colSums, numeric(length(N9)))
t(A[1:12, ])
#> Germany Denmark China Netherlands Taiwan France Japan
#> Business & Finance 362 208 248 202 231 190 185
#> R&D 138 148 334 62 45 80 117
#> Grid Connection 296 120 30 112 8 69 22
#> Authorities 133 113 64 92 105 83 50
#> Technology 91 74 93 43 11 45 79
#> Operations & Maintenance 119 67 24 60 20 14 8
#> Vessels 66 41 54 51 24 12 6
#> Training & Education 68 51 11 27 18 5 10
#> Contracts & Tenders 54 33 17 58 65 4 18
#> Environment 16 12 10 14 18 1 2
#> Ports & Logistics 32 24 3 18 17 7 4
#> Jobs & Recruitment 2 6 3 2 4 0 0
#> Industry Contribution 6 1 4 5 19 2 5
#> Wind Farm Update 1 1 1 1 3 0 0
#> Norway Ireland Belgium India United States
#> Business & Finance 107 70 95 50 34
#> R&D 64 70 28 140 102
#> Grid Connection 95 37 61 4 7
#> Authorities 50 157 18 36 65
#> Technology 34 28 21 19 14
#> Operations & Maintenance 32 19 23 16 7
#> Vessels 32 3 25 0 6
#> Training & Education 6 23 0 6 7
#> Contracts & Tenders 8 0 14 8 3
#> Environment 6 6 2 15 10
#> Ports & Logistics 0 13 12 0 2
#> Jobs & Recruitment 0 3 0 0 1
#> Industry Contribution 3 0 1 1 1
#> Wind Farm Update 0 0 3 0 0
# Interactive heatmap of the first 12 columns of t(A) in a green palette.
# Rowv and Colv are passed as FALSE by name rather than as positional `F`.
t(A)[, 1:12] %>%
  as.data.frame.matrix() %>%
  d3heatmap(Rowv = FALSE, Colv = FALSE, col = "Greens")